Higgs dataset experiment

Researchers: Kirell Benzi, Benjamin Ricaud, Pierre Vandergheynst

From the paper: Principal Patterns on Graphs

The dataset is available from SNAP (the Stanford Network Analysis Project).
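
The raw SNAP archive uses different file names from the HiggsDiscovery_* files loaded below. A minimal preparation sketch, assuming the standard SNAP higgs-twitter file names (the mapping and the helper are our convention, not part of the original pipeline):


In [ ]:
# Hypothetical helper: copy the SNAP higgs-twitter files to the names
# this notebook expects. The SNAP names are the published archive names;
# the HiggsDiscovery_* names are the ones used in the cells below.
import os
import shutil

SNAP_TO_LOCAL = {
    'higgs-retweet_network.edgelist.gz':  'HiggsDiscovery_RT.edges.gz',
    'higgs-mention_network.edgelist.gz':  'HiggsDiscovery_MT.edges.gz',
    'higgs-reply_network.edgelist.gz':    'HiggsDiscovery_RE.edges.gz',
    'higgs-social_network.edgelist.gz':   'HiggsDiscovery_social.edges.gz',
    'higgs-activity_time.txt':            'HiggsDiscovery_multiplex_time.txt',
}

def prepare_data_dir(data_dir):
    for snap_name, local_name in SNAP_TO_LOCAL.iteritems():
        src = os.path.join(data_dir, snap_name)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(data_dir, local_name))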


In [15]:
import os
import time
import numpy as np
import scipy as sp
import pandas as pd
import networkx as nx
import datetime as dt
import itertools
import operator
import community
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import Counter

import IPython.utils.path
DATA_DIR = os.path.join(IPython.utils.path.get_home_dir(), 'data/higgs/')
print 'Data directory:', DATA_DIR
dataset_name = 'higgs'


# Install this from https://github.com/kikohs/sptgraph
import sptgraph
%load_ext autoreload
%autoreload 2


# Customize plot colors for dark backgrounds
%matplotlib inline
mpl.rcParams['axes.edgecolor'] = 'grey'
mpl.rcParams['grid.color'] = '#66CCCC'
mpl.rcParams['text.color'] = '#0EBFE9'
mpl.rcParams['xtick.color'] = '#66CCCC'
mpl.rcParams['ytick.color'] = '#66CCCC'
mpl.rcParams['axes.labelcolor'] = '#0EBFE9'


Data directory: /home/ubuntu/data/higgs/
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Optional: use bokeh to plot interactive components


In [ ]:
from bokeh.charts import Bar, show
from bokeh.plotting import output_notebook
output_notebook()

In [ ]:
def plot_spatial_spread_graph(comp):
    nodes = sorted(comp.nodes(data=True), key=lambda x: x[1]['layer'])
    group_hist = []
    layers = []
    for layer, n in itertools.groupby(nodes, key=lambda x: x[1]['layer']):
        elems = list(n)
        # Use the timestamp instead of the layer index as the label.
        # Workaround for a BokehJS bug: labels cannot contain colons ...
        t = elems[0][1]['timestamp'].strftime('%d-%m-%Y %H\'%M\'%S')
        layers.append(t)
        group_hist.append(len(elems))
    
    bar = Bar(np.array(group_hist), layers, title="Spatial spread of component " + str(comp.name), stacked=False)
    return bar

Helper functions


In [3]:
def round_df(df, freq, unit):
    """Floor the DatetimeIndex of df to freq * unit bins.
    Returns the modified df and the matching pandas offset alias."""
    mul = 1
    if unit == 'min':
        mul = 60
        unit = 'Min'

    elif unit == 'hour':
        mul = 60 * 60
        unit = 'H'

    elif unit == 'day':
        mul = 60 * 60 * 24
        unit = 'D'
    else:
        mul = 1
        unit = 'S'

    ns_min = freq * mul * 1000000000
    idx = pd.DatetimeIndex(((df.index.astype(np.int64) // ns_min) * ns_min))
    idx.name = df.index.name
    df.index = idx
    return df, unit


def create_layers(df, freq, unit):
    # Trick: use pandas resample to generate continuous indexes
    dfR = df.resample(str(freq) + unit) 
    layer_map = dict(itertools.izip(dfR.index.astype(np.int64), itertools.count()))
    df['layer'] = np.vectorize(lambda x: layer_map[x])(df.index.astype(np.int64))
    # delta between layers (timedelta)
    delta = dfR.index[1] - dfR.index[0]
    return df, delta
    
    
def create_layered_graph(input_df, freq=1, unit='min', action='RT', social_graph=None):
        
    h = nx.DiGraph()
    h.name = 'Higgs layered ' + action + ' - ' + str(freq) + ' ' + unit
    
    df = input_df.copy()
    if action != '' and action is not None:
        df = df[df['action'] == action]
    
    df, unit = round_df(df, freq, unit)
    df, delta_ts = create_layers(df, freq, unit)
    # Maximum id in the whole dataframe; base ids must start at 1 (not 0)
    # so that the per-layer offset below cannot collide across layers
    max_id = np.max(np.max(df[['src_id', 'tgt_id']]).values)
    
    for idx, row in df.iterrows():
        base_src = row['src_id']
        base_tgt = row['tgt_id']
        layer = row['layer']
        act = row['action']
              
        src = base_src + (layer * max_id)
        tgt = base_tgt + ((layer + 1) * max_id)
        
        # Add nodes
        h.add_node(src, {'base_id': base_src, 'layer': layer, 'timestamp': idx})
        h.add_node(tgt, {'base_id': base_tgt, 'layer': layer + 1, 'timestamp': idx + delta_ts})
        
        # Add edge
        e_d = {'action': act, 'timestamp': idx}
        if social_graph is not None:
            if social_graph.has_edge(base_src, base_tgt):
                e_d['is_social'] = True
            else:
                e_d['is_social'] = False
            
        h.add_edge(src, tgt, e_d)
        
    return h, df


def extract_components(h):
    components = filter(lambda x: x.number_of_edges() >= 1 and x.number_of_nodes() >= 2, 
                    nx.weakly_connected_component_subgraphs(h))
    res = []
    for i, comp in enumerate(components):
        comp.name = i
        res.append({'component': comp})
        
    df = pd.DataFrame(res)
    df.index.name = 'component_id'
    return df


def enrich_components(df, with_social=False):
    
    def get_period_span(c):
        ts = sorted(nx.get_node_attributes(c, 'timestamp').values())
        return (ts[0], ts[-1])
    
    def get_social_edge_ratio(c):
        social_edges = sum(nx.get_edge_attributes(c, 'is_social').values())  # True == 1, False 0
        return social_edges / float(c.number_of_edges())
        
    df['node_count'] = df['component'].apply(lambda x: x.number_of_nodes())
    df['edge_count'] = df['component'].apply(lambda x: x.number_of_edges())
    df['height'] = df['component'].apply(lambda x: len(np.unique(nx.get_node_attributes(x, 'base_id').values())))
    df['width'] = df['component'].apply(lambda x: len(np.unique(nx.get_node_attributes(x, 'layer').values())))
    period_series = df['component'].apply(get_period_span)
    df['start'] = period_series.apply(lambda x: x[0])
    df['end'] = period_series.apply(lambda x: x[1])
    if with_social:
        df['social_ratio'] = df['component'].apply(get_social_edge_ratio)
    
    return df.sort('node_count', ascending=False)


def create_activated_components(input_df, freq=1, unit='min', action='RT', social_graph=None):
    h, _ = create_layered_graph(input_df, freq, unit, action, social_graph)
    comp_df = extract_components(h)
    with_social = social_graph is not None
    return enrich_components(comp_df, with_social)


def create_graph_from_activity(activity_df, action='RT'):
    g = nx.DiGraph()
    df = activity_df[activity_df['action'] == action]
    g.name = 'Higgs ' + action
    for idx, d in df.iterrows():
        src = d['src_id']
        tgt = d['tgt_id']
        if not g.has_edge(src, tgt):
            g.add_edge(src, tgt, weight=1)
        else:
            g[src][tgt]['weight'] += 1

    return g


def overlap_graph(g1, g2):
    common_edges = 0
    for u, v in g1.edges_iter():
        if g2.has_edge(u, v):
            common_edges += 1

    res = common_edges * 100 / float(nx.number_of_edges(g1))
    print 'Percentage of overlap (', g1.name, ',', g2.name, '):', res
    print 'Number of common edges:', common_edges
    return res, common_edges


def parse_activity(path, reverse=True):
    """if reverse is True. edges to keep the causality from cause to effet"""
    names = ['src_id', 'tgt_id', 'timestamp', 'action']
    if reverse:
        names = ['tgt_id', 'src_id', 'timestamp', 'action']
        
    df = pd.read_csv(path, sep=' ', header=None, names=names,
                     dtype={'src_id': np.int64, 'tgt_id': np.int64, 'timestamp': np.int64, 'action': str},
                     index_col=2)
    
    df['action'] = df['action'].astype(str)
    df.index = df.index.astype('datetime64[s]')
    df.index.name = 'timestamp'
    return df
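
As a quick sanity check of the two time helpers above: round_df floors every timestamp to the start of its bin, and create_layers numbers the bins consecutively. A minimal sketch on synthetic data (the three timestamps are made up for illustration):


In [ ]:
# Illustration only: three fake retweet events, 10-minute bins.
toy = pd.DataFrame({'src_id': [1, 2, 3], 'tgt_id': [4, 5, 6], 'action': ['RT'] * 3},
                   index=pd.DatetimeIndex(['2012-07-04 08:03:10',
                                           '2012-07-04 08:07:55',
                                           '2012-07-04 08:12:01'], name='timestamp'))
toy, u = round_df(toy, 10, 'min')
print toy.index.tolist()     # timestamps floored to 08:00, 08:00 and 08:10
toy, delta = create_layers(toy, 10, u)
print toy['layer'].tolist()  # [0, 0, 1]
print delta                  # 0:10:00, the step between consecutive layers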

Parse data

Parse retweet graph


In [ ]:
start = time.time()
# path = os.path.join(DATA_DIR, 'HiggsDiscovery_RT.edges.gz')
# RETWEET = nx.read_edgelist(path, create_using=nx.DiGraph(),
#                            nodetype=int, data=(('weight', int),))
# RETWEET.name = 'Higgs RT'
# nx.write_gpickle(RETWEET, os.path.join(DATA_DIR, 'retweet.gpickle'))
RETWEET = nx.read_gpickle(os.path.join(DATA_DIR, 'retweet.gpickle'))
print 'Retweet graph loaded in:', time.time() - start
print nx.info(RETWEET)

Parse mention graph


In [ ]:
start = time.time()
# MENTION = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_MT.edges.gz'), 
#                            create_using=nx.DiGraph(), nodetype=int, data=(('weight', int),))
# MENTION.name = 'Higgs MT'
# nx.write_gpickle(MENTION, os.path.join(DATA_DIR, 'mention.gpickle'))

MENTION = nx.read_gpickle(os.path.join(DATA_DIR, 'mention.gpickle'))
print 'Mention graph loaded in:', time.time() - start
print nx.info(MENTION)

Parse reply graph


In [ ]:
start = time.time()
# REPLY = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_RE.edges.gz'),
#                          create_using=nx.DiGraph(), nodetype=int, data=(('weight', int),))
# REPLY.name = 'Higgs RE'
# nx.write_gpickle(REPLY, os.path.join(DATA_DIR, 'reply.gpickle'))

REPLY = nx.read_gpickle(os.path.join(DATA_DIR, 'reply.gpickle'))
print 'Reply graph loaded in:', time.time() - start
print nx.info(REPLY)

Parse social network (follower network)


In [ ]:
start = time.time()
# SOCIAL = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_social.edges.gz'),
#                           create_using=nx.DiGraph(), nodetype=int, )
# SOCIAL.name = 'Higgs SOCIAL'
# nx.write_gpickle(SOCIAL, os.path.join(DATA_DIR, 'social.gpickle'))

SOCIAL = nx.read_gpickle(os.path.join(DATA_DIR, 'social.gpickle'))
print 'Social graph loaded in:', time.time() - start
print nx.info(SOCIAL)

In [4]:
start = time.time()
# SOCIAL_RE = nx.reverse(SOCIAL)
# SOCIAL_RE.name = 'Higgs SOCIAL reversed'
# nx.write_gpickle(SOCIAL_RE, os.path.join(DATA_DIR, 'social_re.gpickle'))
SOCIAL_RE = nx.read_gpickle(os.path.join(DATA_DIR, 'social_re.gpickle'))
print 'Social reversed graph loaded in:', time.time() - start
print nx.info(SOCIAL_RE)


Social reversed graph loaded in: 86.8898489475
Name: Higgs SOCIAL reversed
Type: DiGraph
Number of nodes: 456626
Number of edges: 14855842
Average in degree:  32.5339
Average out degree:  32.5339

Analysis

  • Period I: Before the announcement on 2nd July, there were some rumors about the discovery of a Higgs-like boson at Tevatron;
  • Period II: On 2nd July at 1 PM GMT, scientists from the CDF and D0 experiments, based at Tevatron, presented results indicating that the Higgs particle should have a mass between 115 and 135 GeV/c² (about 123-144 times the mass of the proton) [7];
  • Period III: After 2nd July and before 4th July, there were many rumors about the Higgs boson discovery at LHC [8];
  • Period IV: The main event was the announcement on 4th July at 8 AM GMT by the scientists from the ATLAS and CMS experiments, based at CERN, presenting results indicating the existence of a new particle, compatible with the Higgs boson, with mass around 125 GeV/c² [9, 10]. After 4th July, popular media covered the event. These four time windows are encoded in the sketch below.
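
A minimal sketch encoding the four windows for slicing the activity dataframe; the announcement times are the ones quoted above (all GMT), while the outer boundaries of Periods I and IV and the midnight cut between Periods II and III are our assumptions:


In [ ]:
# Assumed period boundaries (GMT) for slicing the activity dataframe.
PERIODS = {
    'I':   (pd.Timestamp('2012-07-01 00:00:00'), pd.Timestamp('2012-07-02 13:00:00')),
    'II':  (pd.Timestamp('2012-07-02 13:00:00'), pd.Timestamp('2012-07-03 00:00:00')),
    'III': (pd.Timestamp('2012-07-03 00:00:00'), pd.Timestamp('2012-07-04 08:00:00')),
    'IV':  (pd.Timestamp('2012-07-04 08:00:00'), pd.Timestamp('2012-07-08 00:00:00')),
}

def activity_in_period(df, name):
    """Return the rows of an activity dataframe falling in the named period."""
    start, end = PERIODS[name]
    return df[(df.index >= start) & (df.index < end)]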

Extract causal multilayer graph activated components

The graph is already given by the activity, so we don't use our framework in this special case.
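
Concretely, create_layered_graph above keeps node ids unique across layers by offsetting the base id with layer * max_id. A small worked example of the encoding and its inverse (both helpers are ours, for illustration):


In [ ]:
# A node with base_id b in layer l gets id b + l * max_id, which is
# collision-free as long as base ids run from 1 to max_id (hence the
# "min should not be 0" remark in create_layered_graph).
def encode_node(base_id, layer, max_id):
    return base_id + layer * max_id

def decode_node(node_id, max_id):
    layer, base_id = divmod(node_id, max_id)
    if base_id == 0:  # the id max_id itself belongs to the previous layer
        layer, base_id = layer - 1, max_id
    return base_id, layer

nid = encode_node(42, 3, 456626)  # 456626: node count of the social graph
print nid                         # 1369920
print decode_node(nid, 456626)    # (42, 3)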


In [5]:
ACTIVITY = parse_activity(os.path.join(DATA_DIR, 'HiggsDiscovery_multiplex_time.txt'), reverse=True)
COMPS = create_activated_components(ACTIVITY, 10, 'min', 'RT', SOCIAL_RE)  # slow: builds the full layered graph

Filter components and extract global statistics


In [ ]:
FILT_COMPS = COMPS[COMPS['node_count'] > 20]  # drop components that are too small
C = FILT_COMPS.set_index('start', drop=False).sort_index()
# Resample to get statistics (averages the values)
C.resample('12H')

In [ ]:
fig = plot_spatial_spread_graph(FILT_COMPS.iloc[0]['component'])
show(fig)

Create "classic" causal multilayer with our framework


In [19]:
def create_signal(input_df, freq=1, unit='min', action='RT', subset=None):
    df = input_df.copy()
    if action != '' and action is not None:
        df = df[df['action'] == action]
        
    df, unit = round_df(df, freq, unit)
    df, delta_ts = create_layers(df, freq, unit)
    df.reset_index(drop=False, inplace=True)

    res = None
    if subset is None:
        src_df = df[['src_id', 'layer', 'timestamp']].rename(columns = {'src_id':'base_id'})
        tgt_df = df[['tgt_id', 'layer', 'timestamp']].rename(columns = {'tgt_id':'base_id'})

        tgt_df['layer'] = tgt_df['layer'].apply(lambda x: x + 1)
        tgt_df['timestamp'] = tgt_df['timestamp'].apply(lambda x: x + delta_ts)

        res = pd.concat([src_df, tgt_df], ignore_index=True) \
                    .sort('layer').reset_index(drop=True).drop_duplicates()
    
    else:
        subset_df = df[[subset, 'layer', 'timestamp']].rename(columns = {subset: 'base_id'})
        res = subset_df.sort('layer').reset_index(drop=True).drop_duplicates()
    
    return res


def get_period_span_gl(df, signal):
    
    def impl(row):
        min_layer = layer_to_ts[np.min(row)]
        max_layer = layer_to_ts[np.max(row)]
        return [min_layer, max_layer]
    
    layer_to_ts = dict(signal[['layer', 'timestamp']].values)
    
    period_series = df['layers'].apply(impl)
    df['start'] = period_series.apply(lambda x: x[0])
    df['end'] = period_series.apply(lambda x: x[1])
    return df


def extract_components_gl(signal, graph, create_self_edge=True):
    node_signal = sptgraph.utils.from_pandas(signal)
    h, max_id = sptgraph.create_spatio_temporal_graph(graph, node_signal, create_self_edge, 'base_id', 'layer')
    h = sptgraph.find_connected_components(h)
    comps = sptgraph.get_component_sframe(h, 'base_id', 'layer')
    return get_period_span_gl(comps, signal), h, max_id

In [8]:
SOCIAL_GL_RE = sptgraph.utils.networkx_to_graphlab(SOCIAL_RE)

First scale: 10-min RT

Using only tgt_id as the activation signal


In [20]:
signal_10min_rt_tgt = create_signal(ACTIVITY, 10, 'min', 'RT', subset='tgt_id')
gl_comps_10min_rt_tgt, H, max_id = extract_components_gl(signal_10min_rt_tgt, SOCIAL_GL_RE)
layer_to_ts = dict(signal_10min_rt_tgt[['layer', 'timestamp']].values)


Create spatio-temporal graph
Create node signal
Create node signal done in: 2.12166500092 seconds
Create signal graph
Create signal graph done in: 27.1102449894 seconds
Create spatio-temporal graph done in: 63.8228449821 seconds

In [21]:
gl_comps_10min_rt_tgt[['component_id', 'node_count', 'width', 'height', 'start', 'end']]


Out[21]:
+--------------+------------+-------+--------+---------------------------+---------------------------+
| component_id | node_count | width | height |           start           |            end            |
+--------------+------------+-------+--------+---------------------------+---------------------------+
|     650      |   55037    |  108  | 36800  | 2012-07-04 03:10:00+00:00 | 2012-07-04 21:00:00+00:00 |
|     483      |    357     |   15  |  324   | 2012-07-03 17:40:00+00:00 | 2012-07-03 20:00:00+00:00 |
|     9736     |    299     |   12  |  277   | 2012-07-05 11:30:00+00:00 | 2012-07-05 13:20:00+00:00 |
|     9398     |    254     |   12  |  231   | 2012-07-05 05:00:00+00:00 | 2012-07-05 06:50:00+00:00 |
|     9113     |    244     |   9   |  235   | 2012-07-05 00:00:00+00:00 | 2012-07-05 01:20:00+00:00 |
|     138      |    232     |   12  |  212   | 2012-07-02 16:20:00+00:00 | 2012-07-02 18:10:00+00:00 |
|     551      |    200     |   14  |  163   | 2012-07-03 21:00:00+00:00 | 2012-07-03 23:10:00+00:00 |
|     7705     |    169     |   5   |  107   | 2012-07-04 14:40:00+00:00 | 2012-07-04 15:20:00+00:00 |
|     9688     |    166     |   9   |  160   | 2012-07-05 10:30:00+00:00 | 2012-07-05 11:50:00+00:00 |
|     8932     |    142     |   9   |  128   | 2012-07-04 20:50:00+00:00 | 2012-07-04 22:10:00+00:00 |
|     ...      |    ...     |  ...  |  ...   |            ...            |            ...            |
+--------------+------------+-------+--------+---------------------------+---------------------------+
[14006 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Plot the spatial spread of the biggest component through time


In [11]:
big_comp = gl_comps_10min_rt_tgt[0]

In [22]:
def plot_component_spatial_spread(comp, layer_to_ts, size=(10, 10), savepath=None):
    hist = Counter(sorted(comp['layers']))
    x, y = zip(*sorted(hist.items(), key=operator.itemgetter(0)))
    
    comp_time = map(lambda t: (t, layer_to_ts[t]), x) 
    # Get xticks by hour
    xticks, labels = zip(*[(k, v.strftime('%H-%M')) for (k, v) in comp_time if v.minute == 0])

    fig = plt.figure(0)
    fig.set_size_inches(size)
    ax = plt.plot(x, y)
    plt.xticks(xticks, labels, rotation=45)
    
    plt.xlabel('Time')
    plt.ylabel('Number of retweets')
    plt.title('Spatial spread of component {}'.format(comp['component_id']))
    if savepath is not None:
        fig.savefig(savepath, bbox_inches='tight')
    plt.show()

# plot_component_spatial_spread(big_comp, layer_to_ts)

Extract the biggest component subgraph for further analysis


In [92]:
g = sptgraph.component_to_networkx(big_comp, H, layer_to_ts)
print nx.info(g)


Name: Component 650
Type: DiGraph
Number of nodes: 55037
Number of edges: 106895
Average in degree:   1.9422
Average out degree:   1.9422

In [80]:
g_un = g.to_undirected()
partition = community.best_partition(g_un)

In [81]:
print 'Number of communities:', len(set(partition.values()))


Number of communities: 166

In [82]:
print 'Modularity:', community.modularity(partition, g_un)


Modularity: 0.829348352906
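
With 166 communities at modularity 0.83, it is worth checking how skewed the community sizes are. A short follow-up sketch on the partition dict (partition maps node id to community id):


In [ ]:
# Community sizes: counting the community ids gives one count per community.
comm_sizes = Counter(partition.values())
print 'Ten largest communities (community_id, size):'
print comm_sizes.most_common(10)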

Export the component for plotting using Gephi


In [93]:
path = os.path.join(DATA_DIR, g.name)
# nx.write_edgelist(g, path + '.csv', data=False)
nx.write_graphml(g, path + '.graphml')

Another scale: 1-min RT

Using only tgt_id


In [ ]:
signal_1min_rt_tgt = create_signal(ACTIVITY, 1, 'min', 'RT', subset='tgt_id')
gl_comps_1min_rt_tgt, H1, _ = extract_components_gl(signal_1min_rt_tgt, SOCIAL_GL_RE)

In [ ]:
gl_comps_1min_rt_tgt[['component_id', 'node_count', 'width', 'height', 'start', 'end']]

Overlap between retweet and social network


In [ ]:
# overlap_graph is defined with the helper functions above
# overlap_graph(REPLY, SOCIAL)
overlap_graph(RETWEET, SOCIAL)
# overlap_graph(MENTION, SOCIAL)